Converting a ggplot2 scatterplot

Create an interactive version of the scatterplot below, exploring the relationship between video game sales in North America (NA_Sales) and aggregate critic score (Critic_Score) in 2016.

# Load the plotly package
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.1     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag()    masks stats::lag()
# Read the video game sales table used by every chart in this file.
# The location defaults to the original hard-coded path; set the VGSALES_CSV
# environment variable to point at the file on another machine.
vgsales <- read.csv(
  Sys.getenv("VGSALES_CSV", unset = "C:/Users/jihun/Downloads/vgsales.csv")
)
# Build the static ggplot2 scatterplot from the rows where Year is 2016:
# NA_Sales on the x-axis, Critic_Score on the y-axis, points drawn at 30% alpha.
scatter <- ggplot(
  data = filter(vgsales, Year == 2016),
  mapping = aes(x = NA_Sales, y = Critic_Score)
) +
  geom_point(alpha = 0.3)

# Hand the stored ggplot2 object to ggplotly() to get the plotly version
scatter %>%
  ggplotly()

Histograms

create a histogram from scratch in order to explore the distribution of the critic scores of video games sold between 1980 and 2016

# Histogram of Critic_Score over the whole vgsales table
plot_ly(data = vgsales, x = ~Critic_Score) %>%
  add_histogram()
## Warning: Ignoring 8467 observations

Bar charts

create a bar chart from scratch in order to explore the distribution of the video game genres represented in games sold between 1980 and 2016.

# Tally the rows per Genre; count() stores the tally in column n
genre_table <- count(vgsales, Genre)

# Re-level Genre by descending n so the bars run from most to least frequent,
# then draw one bar per genre
add_bars(
  plot_ly(
    data = mutate(genre_table, Genre = fct_reorder(Genre, n, .desc = TRUE)),
    x = ~Genre,
    y = ~n
  )
)
## Warning: Ignoring 1 observations

Scatterplot

create a scatterplot in plotly to explore the relationship between the average player score (User_Score) and the average critic score (Critic_Score)

# Marker plot with Critic_Score on the x-axis and User_Score on the y-axis
add_markers(
  plot_ly(data = vgsales, x = ~Critic_Score, y = ~User_Score)
)
## Warning: Ignoring 8500 observations

Stacked bar chart

create a stacked bar chart to investigate whether there is an association between the Genre and Rating of video games.

# Keep only the rows where Year is 2016
vg2016 <- filter(vgsales, Year == 2016)

# Count each Genre/Rating combination, draw one bar series per Rating,
# and stack the series on top of each other
count(vg2016, Genre, Rating) %>%
  plot_ly(x = ~Genre, y = ~n, color = ~Rating) %>%
  add_bars() %>%
  layout(barmode = "stack")

Boxplot

create a boxplot of global video game sales (the number of units sold) for each genre.

# Keep only the rows where Year is 2016
vgsales2016 <- filter(vgsales, Year == 2016)

# One boxplot of Global_Sales per Genre (sales on x, genre on y)
plot_ly(data = vgsales2016, x = ~Global_Sales, y = ~Genre) %>%
  add_boxplot()

Customization

Color and Opacity

Increasing the transparency (i.e. decreasing the opacity) of a trace can help improve its readability.

# Critic_Score histogram for 2016 drawn in navy at 50% opacity;
# I() passes the color through as a literal value
plot_ly(data = vgsales2016, x = ~Critic_Score) %>%
  add_histogram(color = I("navy"), opacity = 0.5)
## Warning: Ignoring 270 observations

Alternative color formats

# Same histogram, with the bar color given as the hex code #111e6c
plot_ly(data = vgsales2016, x = ~Critic_Score) %>%
  add_histogram(color = I("#111e6c"))
## Warning: Ignoring 270 observations
# Same histogram, with the bar color given as an rgb() string on the marker
plot_ly(data = vgsales2016, x = ~Critic_Score) %>%
  add_histogram(marker = list(color = "rgb(17, 30, 108)"))
## Warning: Ignoring 270 observations

Size and symbol

Changing the plotting symbol can make charts easier to read by addressing issues such as overplotting. Decreasing the size of points can also help overcome minor overplotting issues.

# Scatterplot of the 2016 scores drawn with size-4 diamond markers
vg2016 %>%
  plot_ly(x = ~User_Score, y = ~Critic_Score) %>%
  add_markers(marker = list(symbol = "diamond", size = 4))
## Warning: Ignoring 270 observations

Adding a third variable

# Use color to add Genre as a third variable.
# The "Dark2" palette only defines 8 colors, fewer than the number of genres,
# which made RColorBrewer warn on every draw. "Paired" is a qualitative
# palette with 12 colors (assumes at most 12 genres in the data; TODO confirm).
vgsales2016 %>%
  plot_ly(x = ~Critic_Score, y = ~User_Score, color = ~Genre) %>%
  add_markers(colors = "Paired")
## Warning: Ignoring 270 observations
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

Beyond colors: Symbols

Using both color and shape to encode a categorical variable can be a useful strategy to reveal relationships more effectively. This double-encoding strategy uses two pre-attentive visual cues to more-quickly communicate information to the reader. This idea is implemented by default in plotly; if you map a variable to the symbol, plotly automatically maps the variable to the color.

# Scatterplot of User_Score against Critic_Score where Rating is encoded
# twice: once as marker color and once as marker symbol
plot_ly(
  data = vgsales2016,
  x = ~Critic_Score,
  y = ~User_Score,
  color = ~Rating,
  symbol = ~Rating
) %>%
  add_markers()
## Warning: Ignoring 270 observations
## Warning: The following are not valid symbol codes:
## 'NA'
## Valid symbols include:
## '0', 'circle', '100', 'circle-open', '200', 'circle-dot', '300', 'circle-open-dot', '1', 'square', '101', 'square-open', '201', 'square-dot', '301', 'square-open-dot', '2', 'diamond', '102', 'diamond-open', '202', 'diamond-dot', '302', 'diamond-open-dot', '3', 'cross', '103', 'cross-open', '203', 'cross-dot', '303', 'cross-open-dot', '4', 'x', '104', 'x-open', '204', 'x-dot', '304', 'x-open-dot', '5', 'triangle-up', '105', 'triangle-up-open', '205', 'triangle-up-dot', '305', 'triangle-up-open-dot', '6', 'triangle-down', '106', 'triangle-down-open', '206', 'triangle-down-dot', '306', 'triangle-down-open-dot', '7', 'triangle-left', '107', 'triangle-left-open', '207', 'triangle-left-dot', '307', 'triangle-left-open-dot', '8', 'triangle-right', '108', 'triangle-right-open', '208', 'triangle-right-dot', '308', 'triangle-right-open-dot', '9', 'triangle-ne', '109', 'triangle-ne-open', '209', 'triangle-ne-dot', '309', 'triangle-ne-open-dot', '10', 'triangle-se', '110', 'triangle-se-open', '210', 'triangle-se-dot', '310', 'triangle-se-open-dot', '11', 'triangle-sw', '111', 'triangle-sw-open', '211', 'triangle-sw-dot', '311', 'triangle-sw-open-dot', '12', 'triangle-nw', '112', 'triangle-nw-open', '212', 'triangle-nw-dot', '312', 'triangle-nw-open-dot', '13', 'pentagon', '113', 'pentagon-open', '213', 'pentagon-dot', '313', 'pentagon-open-dot', '14', 'hexagon', '114', 'hexagon-open', '214', 'hexagon-dot', '314', 'hexagon-open-dot', '15', 'hexagon2', '115', 'hexagon2-open', '215', 'hexagon2-dot', '315', 'hexagon2-open-dot', '16', 'octagon', '116', 'octagon-open', '216', 'octagon-dot', '316', 'octagon-open-dot', '17', 'star', '117', 'star-open', '217', 'star-dot', '317', 'star-open-dot', '18', 'hexagram', '118', 'hexagram-open', '218', 'hexagram-dot', '318', 'hexagram-open-dot', '19', 'star-triangle-up', '119', 'star-triangle-up-open', '219', 'star-triangle-up-dot', '319', 'star-triangle-up-open-dot', '20', 'star-triangle-down', '120', 'star-triangle-down-open', '220', 
'star-triangle-down-dot', '320', 'star-triangle-down-open-dot', '21', 'star-square', '121', 'star-square-open', '221', 'star-square-dot', '321', 'star-square-open-dot', '22', 'star-diamond', '122', 'star-diamond-open', '222', 'star-diamond-dot', '322', 'star-diamond-open-dot', '23', 'diamond-tall', '123', 'diamond-tall-open', '223', 'diamond-tall-dot', '323', 'diamond-tall-open-dot', '24', 'diamond-wide', '124', 'diamond-wide-open', '224', 'diamond-wide-dot', '324', 'diamond-wide-open-dot', '25', 'hourglass', '125', 'hourglass-open', '26', 'bowtie', '126', 'bowtie-open', '27', 'circle-cross', '127', 'circle-cross-open', '28', 'circle-x', '128', 'circle-x-open', '29', 'square-cross', '129', 'square-cross-open', '30', 'square-x', '130', 'square-x-open', '31', 'diamond-cross', '131', 'diamond-cross-open', '32', 'diamond-x', '132', 'diamond-x-open', '33', 'cross-thin', '133', 'cross-thin-open', '34', 'x-thin', '134', 'x-thin-open', '35', 'asterisk', '135', 'asterisk-open', '36', 'hash', '136', 'hash-open', '236', 'hash-dot', '336', 'hash-open-dot', '37', 'y-up', '137', 'y-up-open', '38', 'y-down', '138', 'y-down-open', '39', 'y-left', '139', 'y-left-open', '40', 'y-right', '140', 'y-right-open', '41', 'line-ew', '141', 'line-ew-open', '42', 'line-ns', '142', 'line-ns-open', '43', 'line-ne', '143', 'line-ne-open', '44', 'line-nw', '144', 'line-nw-open

Transforming a color scale

When mapping a numeric variable to color, sometimes it is necessary to transform the variable. This is especially true if the variable values differ by an order of magnitude or more. Explore how the number of users helps explain the relationship between user and critic scores for video games in 2016. Explore how applying the natural log can help make a color scale more interpretable

# Scatterplot of User_Score vs. Critic_Score with the raw (untransformed)
# User_Count mapped to marker color
plot_ly(
  data = vgsales2016,
  x = ~Critic_Score,
  y = ~User_Score,
  color = ~User_Count
) %>%
  add_markers()
## Warning: Ignoring 270 observations

Removing a piece of hover info

remove the platform labels from the hover info for the bar chart displaying the number of games released for each platform in 2016.

# Bar chart of the number of 2016 rows per Platform; hoverinfo = "y"
# restricts the hover label to the bar height
count(vgsales2016, Platform) %>%
  plot_ly(x = ~Platform, y = ~n, hoverinfo = "y") %>%
  add_bars()

Adding to hoverinfo

When exploring data, it is often more useful to add an identifying column to the hover info than to polish the chart for publication on the web. This is possible by adding the text argument to the plot_ly() command without specifying hoverinfo = "text".

# Scatterplot of User_Score vs. Critic_Score; mapping text to Name adds the
# game's name to the hover info
plot_ly(
  data = vgsales2016,
  x = ~Critic_Score,
  y = ~User_Score,
  text = ~Name
) %>%
  add_markers()
## Warning: Ignoring 270 observations

Custom Hoverinfo

Customize the hover info to help explore the relationship between North American video game sales and European video game sales in 2016. Identify the video games with the greatest discrepancy between the North American and European sales.

# Scatterplot of EU_Sales against NA_Sales whose hover label is built by hand:
# hoverinfo = "text" shows only the text string, and "<br>" breaks it onto
# separate lines for NA_Sales, EU_Sales and Name
plot_ly(
  data = vgsales2016,
  x = ~NA_Sales,
  y = ~EU_Sales,
  hoverinfo = "text",
  text = ~paste(
    "NA_Sales: ", NA_Sales, "<br>",
    "EU_Sales: ", EU_Sales, "<br>",
    "Name: ", Name
  )
) %>%
  add_markers()

Polishing a scatterplot

display global sales on the log scale

# Half-opaque markers of Critic_Score against Global_Sales, with a
# log-scaled x-axis and readable titles on both axes
plot_ly(data = vgsales2016, x = ~Global_Sales, y = ~Critic_Score) %>%
  add_markers(marker = list(opacity = 0.5)) %>%
  layout(
    xaxis = list(title = "Global sales (millions of units)", type = "log"),
    yaxis = list(title = "Critic score")
  )
## Warning: Ignoring 270 observations

Matching a theme

Match a theme that has only horizontal grid lines and a light gray background (#ebebeb) around the plot.

# Line chart of global sales per year with a themed background.
# The sales are totalled within each Year first: vgsales presumably holds one
# row per game, and a line drawn through every raw row zigzags up and down
# within each year instead of showing the yearly trend.
vgsales %>%
  group_by(Year) %>%
  summarise(Global_Sales = sum(Global_Sales, na.rm = TRUE)) %>%
  plot_ly(x = ~Year, y = ~Global_Sales) %>%
  add_lines() %>%
  # Turning off the x-axis grid removes the vertical grid lines;
  # paper_bgcolor sets the background color to #ebebeb
  layout(xaxis = list(showgrid = FALSE), paper_bgcolor = "#ebebeb")

Adding a linear smoother

add a linear smoother to a scatterplot of user score against critic score for video games in 2016

# Keep the rows that have both scores. Completeness is checked on the two
# model variables only, so a row is not discarded just because some unrelated
# column is missing.
vgsales1 <- vgsales[
  complete.cases(vgsales[, c("User_Score", "Critic_Score")]),
]

# Fit the regression model of User_Score on Critic_Score. vgsales1 has no
# missing scores, so fitted(m) holds exactly one value per row of vgsales1.
m <- lm(User_Score ~ Critic_Score, data = vgsales1)

# Create the scatterplot with smoother: the points come from vgsales1 and the
# line from the fitted values of m, which line up row for row
vgsales1 %>%
  plot_ly(x = ~Critic_Score, y = ~User_Score) %>%
  add_markers(showlegend = FALSE) %>%
  add_lines(y = ~fitted(m))
## Warning: Can't display both discrete & non-discrete data on same axis

Overlayed density plots

Create density plots and overlay them to compare the distribution of critic scores for three video game publishers: Activision, Electronic Arts, and Nintendo. The fill = 'tozeroy' argument fills the area under the curve.

# Split the 2016 data by publisher. %in% never returns NA, so rows with a
# missing Publisher are left out instead of showing up as all-NA rows, which
# is what subsetting with == does.
activision <- vgsales2016[vgsales2016$Publisher %in% "Activision", ]
ea <- vgsales2016[vgsales2016$Publisher %in% "Electronic Arts", ]
nintendo <- vgsales2016[vgsales2016$Publisher %in% "Nintendo", ]

# Compute density curves of Critic_Score per publisher, skipping missing scores
d.a <- density(activision$Critic_Score, na.rm = TRUE)
d.e <- density(ea$Critic_Score, na.rm = TRUE)
d.n <- density(nintendo$Critic_Score, na.rm = TRUE)

# Overlay density plots: one filled line per publisher, built from the x/y
# grid returned by density()
plot_ly() %>%
  add_lines(x = ~d.a$x, y = ~d.a$y, name = "Activision", fill = "tozeroy") %>%
  add_lines(
    x = ~d.e$x, y = ~d.e$y, name = "Electronic Arts", fill = "tozeroy"
  ) %>%
  add_lines(x = ~d.n$x, y = ~d.n$y, name = "Nintendo", fill = "tozeroy") %>%
  layout(
    xaxis = list(title = "Critic Score"),
    yaxis = list(title = "Density")
  )

Manual faceting

The subplot() command allows you to combine charts to create facets (i.e. subplots or small multiples). This is a great way to explore distributions and relationships across factors.

# Panel 1: User_Score against Critic_Score for the rows with Platform "PS4"
p1 <- plot_ly(
  data = filter(vgsales2016, Platform == "PS4"),
  x = ~Critic_Score,
  y = ~User_Score
) %>%
  add_markers(name = "PS4")

# Panel 2: the same scatterplot for the rows with Platform "XOne"
p2 <- plot_ly(
  data = filter(vgsales2016, Platform == "XOne"),
  x = ~Critic_Score,
  y = ~User_Score
) %>%
  add_markers(name = "XOne")

# Create a faceted scatterplot containing p1 and p2, arranged in two rows
subplot(p1, p2, nrows = 2)
## Warning: Ignoring 66 observations
## Warning: Ignoring 27 observations

Automated faceting

Automate the process of creating a faceted scatterplot with 12 facets.

# Faceted scatterplot of User_Score vs. Critic_Score laid out in 3 rows:
# split the 2016 data into one table per Platform, build one marker plot per
# table (named after its Platform), and combine the plots with shared axes
vgsales2016 %>%
  group_by(Platform) %>%
  group_split() %>%
  lapply(function(platform_rows) {
    plot_ly(data = platform_rows, x = ~Critic_Score, y = ~User_Score) %>%
      add_markers(name = ~Platform)
  }) %>%
  subplot(nrows = 3, shareY = TRUE, shareX = TRUE)
## Warning: Ignoring 36 observations
## Warning: Ignoring 9 observations
## Warning: Ignoring 38 observations
## Warning: Ignoring 66 observations
## Warning: Ignoring 75 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 5 observations
## Warning: Ignoring 13 observations
## Warning: Ignoring 27 observations
## Warning: Can't display both discrete & non-discrete data on same axis

## Warning: Can't display both discrete & non-discrete data on same axis

## Warning: Can't display both discrete & non-discrete data on same axis

Plot and axis titles

add titles to subplots

# Combine p1 and p2 in two rows with shared axes, keep the axis titles of the
# panels (titleX/titleY), and put an overall title on the figure
sp2 <- layout(
  subplot(
    p1, p2,
    nrows = 2,
    shareX = TRUE,
    shareY = TRUE,
    titleX = TRUE,
    titleY = TRUE
  ),
  title = "User score vs. critic score by platform, 2016"
)
## Warning: Ignoring 66 observations
## Warning: Ignoring 27 observations
sp2

Polishing axis titles

The axes in a subplot can be renamed using the layout() command, just like in a single plot; however, there are multiple x-axes to rename.

# Relabel the axes of sp2. Both panels (p1 and p2) plot Critic_Score on the
# x-axis and User_Score on the y-axis, so the titles name those variables
# rather than "Year" / "Global Sales (M units)".
# NOTE(review): sp2 was built with shareX = TRUE, so subplot() presumably
# keeps a single x-axis (xaxis); xaxis2 is titled as well in case a second
# x-axis exists. Confirm against the rendered figure.
sp2 %>%
  layout(
    xaxis = list(title = "Critic score"),
    xaxis2 = list(title = "Critic score"),
    yaxis = list(title = "User score"),
    yaxis2 = list(title = "User score")
  )

Scatterplot Matrices

# Scatterplot matrix (trace type "splom") of the three regional sales columns;
# each dimension pairs a display label with the column that supplies its values
plot_ly(data = vgsales2016) %>%
  add_trace(
    type = "splom",
    dimensions = list(
      list(label = "N. America", values = ~NA_Sales),
      list(label = "Europe", values = ~EU_Sales),
      list(label = "Japan", values = ~JP_Sales)
    )
  )

Customizing Color

add color to represent an additional variable in a scatterplot matrix.

# Scatterplot matrix of NA_Sales, EU_Sales and JP_Sales, colored by a derived
# two-level flag: "Nintendo" where Publisher is Nintendo, "Other" elsewhere
splom <- add_trace(
  plot_ly(
    data = mutate(
      vgsales2016,
      nintendo = ifelse(Publisher == "Nintendo", "Nintendo", "Other")
    ),
    color = ~nintendo
  ),
  type = "splom",
  dimensions = list(
    list(label = "N.America", values = ~NA_Sales),
    list(label = "Europe", values = ~EU_Sales),
    list(label = "Japan", values = ~JP_Sales)
  )
)
splom
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

Tweaking the appearance

Deleting the diagonal panels. Displaying only the upper or lower triangle of plots.

# Hide the diagonal panels of splom
style(splom, diagonal = list(visible = FALSE))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# Hide the panels in the upper half of splom
style(splom, showupperhalf = FALSE)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# Hide the panels in the lower half of splom
style(splom, showlowerhalf = FALSE)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

Binned Scatterplot

To help avoid overplotting, create a binned scatterplot of User_Score against Critic_Score to display the entire dataset.

 # Create a binned scatterplot of User_Score vs. Critic_Score
plot_ly(data = vgsales, x = ~Critic_Score, y = ~User_Score) %>%
  # 2D histogram trace with 50 bins requested along each axis
  add_histogram2d(nbinsx = 50, nbinsy = 50)